import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import pandas_profiling
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
def read_amb_clean_dataset(file_path, sheet_name):
    """Read and clean one AMB reference-station sheet from an Excel workbook.

    Parameters
    ----------
    file_path : str
        Path to the AMB Excel workbook.
    sheet_name : str
        Name of the worksheet to load.

    Returns
    -------
    pandas.DataFrame
        Measurements indexed by the parsed "Date&Time" column, with station
        error markers converted to NaN, measurement columns cast to float,
        and remaining numeric gaps filled with each column's mean.
    """
    ground_truth = pd.read_excel(file_path, sheet_name=sheet_name) # read sheet normal
    # First row holds sub-headers/units rather than data.
    ground_truth = ground_truth.drop(ground_truth.index[0])
    # Station error markers that must become missing values (one pass
    # instead of five chained replace calls).
    ground_truth = ground_truth.replace(
        ['NoData', 'nan', '<Samp', '---', 'InVld'], np.nan)
    # Cast every measurement column to float in one loop.
    measurement_columns = ["PM10", "PM2.5", "Temp_Aire", "Lluvia",
                           "Humedad_relativa", "WD", "WS", "R_Solar"]
    for column in measurement_columns:
        ground_truth[column] = ground_truth[column].astype('float')
    ground_truth["Date&Time"] = pd.to_datetime(
        ground_truth["Date&Time"], format='%d/%m/%Y/%H:%M:%S')
    # BUG FIX: numeric_only=True — DataFrame.mean() raises on the datetime
    # "Date&Time" column in pandas >= 2.0 unless non-numeric columns are
    # excluded; behavior is unchanged for the numeric columns.
    ground_truth = ground_truth.fillna(ground_truth.mean(numeric_only=True))
    ground_truth.set_index('Date&Time', inplace=True)
    print("La AMB tomó {} registros".format(len(ground_truth)))
    return ground_truth
def read_sensor_clean_dataset(file_path):
    """Read and clean one low-cost-sensor CSV export.

    Parameters
    ----------
    file_path : str
        Path to a CSV file containing at least a "fecha_hora_med"
        timestamp column.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by the parsed "fecha_hora_med" timestamps, with error
        markers converted to NaN, numeric gaps mean-filled, the original
        positional index preserved in a "df_index" column, and fully
        duplicated measurement rows removed (all copies dropped).
    """
    ground_truth = pd.read_csv(file_path) # read sheet normal
    # Sensor error markers that must become missing values (one pass
    # instead of five chained replace calls).
    ground_truth = ground_truth.replace(
        ['NoData', 'nan', '<Samp', '---', 'InVld'], np.nan)
    # BUG FIX: numeric_only=True — mean() on a frame containing the string
    # timestamp column raises in pandas >= 2.0.
    ground_truth = ground_truth.fillna(ground_truth.mean(numeric_only=True))
    # Remember each row's original position before re-indexing by time.
    ground_truth["df_index"] = ground_truth.index
    # BUG FIX: Series.replace matches whole cell values, so the original
    # replace("T", " ") was a no-op; use the vectorized string method to
    # actually strip the ISO-8601 "T" separator before parsing.
    ground_truth["fecha_hora_med"] = ground_truth["fecha_hora_med"].str.replace("T", " ", regex=False)
    ground_truth["fecha_hora_med"] = pd.to_datetime(ground_truth["fecha_hora_med"])
    # BUG FIX: exclude the per-row-unique "df_index" column from duplicate
    # detection — with it included no two rows could ever match, making
    # the original drop_duplicates a no-op. keep=False drops every copy
    # of a duplicated measurement.
    dedupe_columns = [c for c in ground_truth.columns if c != "df_index"]
    ground_truth = ground_truth.drop_duplicates(subset=dedupe_columns, keep=False)
    ground_truth.set_index('fecha_hora_med', inplace=True)
    return ground_truth
# ---- Load the AMB reference-station data -------------------------------
amb_file_path = "datos/Datos Estaciones AMB.xlsx"
amb_sheet_name = "Normal"
amb_data_df = read_amb_clean_dataset(amb_file_path, amb_sheet_name)
amb_data_df.profile_report()

# ---- Load the monthly low-cost-sensor exports --------------------------
first_file_path = "datos/mediciones_clg_normalsup_pm25_a_2018-11-01T00_00_00_2018-11-30T23_59_59.csv"
second_file_path = "datos/mediciones_clg_normalsup_pm25_a_2018-12-01T00_00_00_2018-12-31T23_59_59.csv"
third_file_path = "datos/mediciones_clg_normalsup_pm25_a_2019-04-01T00_00_00_2019-04-30T23_59_59.csv"
fourth_file_path = "datos/mediciones_clg_normalsup_pm25_a_2019-05-01T00_00_00_2019-05-31T23_59_59.csv"
fifth_file_path = "datos/mediciones_clg_normalsup_pm25_a_2019-06-01T00_00_00_2019-06-30T23_59_59.csv"
sixth_file_path = "datos/mediciones_clg_normalsup_pm25_a_2019-07-01T00_00_00_2019-07-31T23_59_59.csv"
seventh_file_path = "datos/mediciones_clg_normalsup_pm25_a_2019-08-01T00_00_00_2019-08-31T23_59_59.csv"
sensor_file_paths = [first_file_path, second_file_path, third_file_path,
                     fourth_file_path, fifth_file_path, sixth_file_path,
                     seventh_file_path]
# One cleaned frame per monthly export, in chronological order.
first_df, second_df, third_df, fourth_df, fifth_df, sixth_df, seventh_df = [
    read_sensor_clean_dataset(path) for path in sensor_file_paths]

# Stack the monthly frames into one chronological series.
sensor_df = pd.concat([first_df, second_df, third_df,
                       fourth_df, fifth_df, sixth_df,
                       seventh_df]).sort_index()
amb_data_df = amb_data_df.sort_index()
# Strip timezone info so both indexes merge as naive timestamps.
sensor_df.index = sensor_df.index.tz_localize(None)
amb_data_df.index = amb_data_df.index.tz_localize(None)
print("Original sensor data")
# display() is a notebook-provided helper that renders the frame inline.
display(sensor_df)
# Collapse the sensor readings to hourly means so they line up with the
# hourly AMB reference data before merging.
sensor_df = sensor_df.resample('H').mean()
print("Sensor data rescaled to hours")
# Bare expression: shows the resampled frame when run as the last line of
# a notebook cell; it has no effect as a plain script statement.
sensor_df
print("AMB reference data")
display(amb_data_df)
# Inner join on the hourly timestamps keeps only hours present in BOTH
# the sensor series and the AMB reference series.
final_df = pd.merge(sensor_df, amb_data_df, how='inner', left_index=True, right_index=True)
final_df.dropna(axis=0, inplace=True)
final_df = final_df.sort_index()
print("Final Sensor and AMB data")
display(final_df)
# Scatter comparison: raw sensor PM2.5 ("valor") vs. AMB reference PM2.5.
plt.figure(figsize=(20,4))
plt.scatter(final_df.index,final_df.valor, color='red', marker='o', label="Sensor data")
plt.scatter(final_df.index,final_df["PM2.5"], color='blue', marker='o', label="AMB reference data")
plt.legend()
# NOTE(review): the merge/dropna/sort below repeats the block above
# unchanged — final_df is recomputed identically; kept as-is for
# notebook-cell parity.
final_df = pd.merge(sensor_df, amb_data_df, how='inner', left_index=True, right_index=True)
final_df.dropna(axis=0, inplace=True)
final_df = final_df.sort_index()
# Same comparison as the scatter above, drawn as line plots.
plt.figure(figsize=(20,4))
plt.plot(final_df.index,final_df.valor, color='red', label="Sensor data")
plt.plot(final_df.index,final_df["PM2.5"], color='blue', label="AMB reference data")
plt.legend()
display(final_df)
# Euclidean distance between the two aligned series as a single-number
# measure of how far the raw sensor is from the reference.
distancia_euclidea = np.sqrt(np.sum((final_df["valor"] - final_df["PM2.5"])**2))
print("The euclidean distance was {}".format(distancia_euclidea))
def get_results_for_roll(window=4):
    """Smooth the merged series with a rolling mean and report the fit.

    Applies a trailing rolling mean of `window` samples to the
    module-level `final_df`, displays and plots the smoothed sensor and
    AMB reference series, and returns ``(window, euclidean_distance)``
    between them.
    """
    print("=================================================================================")
    print("Results with a rolling mean of {} elements in the window".format(window))
    smoothed = final_df.rolling(window=window).mean()
    # The first window-1 rows are NaN by construction; drop them.
    smoothed.dropna(axis=0, inplace=True)
    display(smoothed)
    # Single-number fit measure between the two smoothed series.
    gap = np.sqrt(np.sum((smoothed["valor"] - smoothed["PM2.5"]) ** 2))
    plt.figure(figsize=(20, 4))
    plt.title("Sensor data - AMB data")
    plt.plot(smoothed.index, smoothed.valor, color='red', label="Sensor data rolling {}".format(window))
    plt.plot(smoothed.index, smoothed["PM2.5"], color='blue', label="AMB reference rolling {}".format(window))
    plt.legend()
    plt.show()
    print("The euclidean distance was {}".format(gap))
    print("=================================================================================")
    return window, gap
# Sweep smoothing windows of 1, 2, 4, ..., 512 samples and collect the
# resulting (window, distance) pairs.
roll_results = [get_results_for_roll(2 ** exponent) for exponent in range(10)]
windows = [w for w, _ in roll_results]
euclidean_distances = [d for _, d in roll_results]
# Error as a function of window size.
plt.figure(figsize=(20, 4))
plt.plot(windows, euclidean_distances, color='orange', label="euclidean distance")
plt.ylabel("Error")
plt.xlabel("Window size")
plt.legend()
plt.show()
# Profile just the two PM2.5 series (sensor "valor" vs. AMB reference).
stats_df = final_df[["valor", "PM2.5"]]
stats_df.profile_report()
def get_results_for_roll_correct(window=4):
    """Rolling-mean smoothing plus a per-window linear calibration.

    Smooths the module-level `final_df` with a trailing rolling mean of
    `window` samples, fits a linear model PM2.5 ≈ m * valor + b on the
    smoothed data, plots the corrected sensor series against the AMB
    reference, and returns ``(window, euclidean_distance)`` between the
    corrected and reference series.
    """
    from sklearn import linear_model
    print("=================================================================================")
    print("Results with a rolling mean of {} elements in the window".format(window))
    smoothed = final_df.rolling(window=window).mean()
    # The first window-1 rows are NaN by construction; drop them.
    smoothed.dropna(axis=0, inplace=True)
    model = linear_model.LinearRegression()
    model.fit(y=smoothed[['PM2.5']], X=smoothed[['valor']].values.reshape(-1, 1))
    slope = model.coef_[0, 0]
    offset = model.intercept_[0]
    print("The coefficient m={:.2f}, and intercept b={:.2f}".format(slope, offset))
    # Apply the calibration to produce the corrected sensor series.
    smoothed["PM2.5_pred"] = slope * smoothed.valor.values + offset
    plt.figure(figsize=(20, 4))
    plt.title("Sensor data - AMB data")
    plt.plot(smoothed.index, smoothed["PM2.5_pred"], color='red', label="Sensor corrected rolling {}".format(window))
    plt.plot(smoothed.index, smoothed["PM2.5"], color='green', label="AMB reference rolling {}".format(window))
    plt.legend()
    plt.show()
    gap = np.sqrt(np.sum((smoothed["PM2.5"] - smoothed["PM2.5_pred"]) ** 2))
    print("The euclidean distance was {}".format(gap))
    print("=================================================================================")
    return window, gap
# Repeat the window sweep with the per-window linear correction applied.
corrected_results = [get_results_for_roll_correct(2 ** exponent) for exponent in range(10)]
windows_c = [w for w, _ in corrected_results]
euclidean_distances_c = [d for _, d in corrected_results]
# Compare corrected vs. uncorrected error across window sizes.
plt.figure(figsize=(20, 4))
plt.plot(windows_c, euclidean_distances_c, color='green', label="euclidean distance with correction")
plt.plot(windows, euclidean_distances, color='orange', label="euclidean distance without correction")
plt.ylabel("Error")
plt.xlabel("Window size")
plt.legend()
plt.show()
# Bare expression: shows the merged frame when run as the last line of a
# notebook cell.
final_df